Сonvert data to csv format to read it easily with R and remove unuseful columns from it
data <- read.csv("../data/IAD30284_384WellPlateDataSheet.csv", header = TRUE, skip = 14)
data <- data[data$Amplicon_ID != "Blank", ]
data$X384Well_Row <- NULL
data$X384Well_Col <- NULL
data$Gene_Symbol <- NULL
data$Genome_Version <- NULL
data$LineItem_Type <- NULL
Get amplicon sequence that lies between primers
library(jsonlite)
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:utils':
##
## View
library(httr)
n <- nrow(data)
chromosomes <- data$Chr
beginings <- data$Insert_Start
endings <- data$Insert_Stop
ampliconInnerPart <- rep("", n)
for (i in 1:n){
call <- paste(c("http://178.79.134.178/refservice/references/GRCh37.p13/sequence?contig=", as.character(chromosomes[i]),
"&start=",as.character(beginings[i]),"&end=",as.character(endings[i])), collapse = "")
call
jsonData <- fromJSON(call)
ampliconInnerPart[i] <- jsonData$sequence
}
head(ampliconInnerPart)
## [1] "TGTTTTCTTTGATCTTACAGTTGTTATTAATTGTGATTGGAGCTATAGCAGTTGTCGCAGTTTTACAACCCTACATCTTTGTTGCAACAGTGCCAGTGATAGTGGCTTTTATTATGTTGAGAGCATATTTCCTCCAAACCTCACAGCAACTCAAACAACTGGAATCTGAAG"
## [2] "AATGCATTAATGCTATTCTGATTCTATAATATGTTTTTGCTCTCTTTTATAAATAGGATTTCTTACAAAAGCAAGAATATAAGACATTGGAATATAACTTAACGACTACAGAAGTAGTGATGGA"
## [3] "ATGGGGCTAATCTGGGAGTTGTTACAGGCGTCTGCCTTCTGTGGACTTGGTTTCCTGATAGTCCTTGCCCTTTTTCAGGCTGGGCTAGGGAGAATGATGATGAAGTACAGGTAGCAACCTATTTTCATAACTTGAAAGTTTTAAAAATTATGTTTTCAAAAAGCCCAC"
## [4] "ACATAAAACAAGCATCTATTGAAAATATCTGACAAACTCATCTTTTATTTTTGATGTGTGTGTGTGTGTGTGTGTGTTTTTTTAACAGGGATTTGGGGAATTATTTGAGAAAGCAAAACAAAACAATAACAATAGAAAAACTTCTAATGGTGATGACAGCCTCTTCTTCAGT"
## [5] "CAGGCAAGGTAGTTCTTTTGTTCTTCACTATTAAGAACTTAATTTGGTGTCCATGTCTCTTTTTTTTTCTAGTTTGTAGTGCTGGAAGGTATTTTTGGAGAAATTCTTAC"
## [6] "AGTTCATTGACATGCCAACAGAAGGTAAACCTACCAAGTCAACCAAACCATACAAGAATGGCCAACTCTCGAAAGTTATGATTA"
data$Amplicon_Inner_Part <- ampliconInnerPart
Get dangling ends
danglingEndStart <- rep("", n)
danglingEndStop <- rep("", n)
for (i in 1:n){
call <- paste(c("http://178.79.134.178/refservice/references/GRCh37.p13/sequence?contig=", as.character(chromosomes[i]),
"&start=",as.character(beginings[i]-1),"&end=",as.character(beginings[i]-1)), collapse = "")
jsonData <- fromJSON(call)
danglingEndStart[i] <- jsonData$sequence
call <- paste(c("http://178.79.134.178/refservice/references/GRCh37.p13/sequence?contig=", as.character(chromosomes[i]),
"&start=",as.character(endings[i]+1),"&end=",as.character(endings[i]+1)), collapse = "")
jsonData <- fromJSON(call)
danglingEndStop[i] <- jsonData$sequence
}
data$Start_Dangling_End <- danglingEndStart
data$Stop_Dangling_End <- danglingEndStop
Load results and get a proportion for each
readResultFile <- function(name, addName = FALSE, delim = "\t"){
filePath <- paste(c("../data/", name), collapse = "")
result <- read.table(filePath, sep = delim, header = TRUE)
result$Gene <- NULL
result$sam <- NULL
result$X <- NULL
sums <- colSums(result[,-1])
proportion_result <- sweep(result[,-1], 2, sums, '/' )
#proportion_result <- sapply(proportion_result, function(x) log(x))
proportion_result$Means <- rowMeans(proportion_result)
proportion_result$Target <- result$Target
y <- data.frame(Target = character(127))
y$Target <- result$Target
ylog <- data.frame(Target = character(127))
ylog$Target <- result$Target
if (addName){
logresultStr <- paste(c(name,"_logresult"), collapse = "")
logMeansStr <- paste(c(name,"_logMeans"), collapse = "")
resultStr <- paste(c(name,"_result"), collapse = "")
MeansStr <- paste(c(name,"_Means"), collapse = "")
}else{
logresultStr <- "logresult"
logMeansStr <- "logMeans"
resultStr <- "result"
MeansStr <- "Means"
}
#ylog[resultStr] <- log(proportion_result[,1])
ylog[MeansStr] <- log(proportion_result$Means)
#y[resultStr] <- proportion_result[,1]
y[MeansStr] <- proportion_result$Means
ylog[ylog[MeansStr] == -Inf,][MeansStr] <- -10 # min(y[ylog$Means != -Inf,]$logMeans) - 20
#ylog[ylog[resultStr] == -Inf,][resultStr] <- -10 # min(y[ylog$result != -Inf,]$logresult) - 20
rownames(y) <- y$Target
rownames(ylog) <- ylog$Target
proportion <- proportion_result
proportion$Means <- NULL
proportion$Target <- NULL
rownames(proportion) <- proportion$Target
return(list(y = y, ylog = ylog, proportions = proportion))
}
y <- readResultFile("result.csv",FALSE, ",")$y
ylog <- readResultFile("result.csv",FALSE, ",")$ylog
#y <- readResultFile("SN1-21_Run_17.xls",FALSE)$y
#ylog <- readResultFile("SN1-21_Run_17.xls",FALSE)$ylog
resultFileNames <- c("SN1-17_Run_14.xls", "SN1-21_Run_17.xls","SN1-23-Run_18.xls","SN1-25-Run_19.xls", "SN1-26-Run_20.xls")
totalTable <- data.frame(Target = character(127))
totalTableOfMeans <- data.frame(Target = character(127))
totalTableOflogMeans <- data.frame(Target = character(127))
for (i in 1:length(resultFileNames)){
res <- readResultFile(resultFileNames[i], TRUE)
namesOfColumns <- names(res$proportions)
totalTable[namesOfColumns] <- res$proportions[namesOfColumns]
namesOfColumns <- names(res$y)
totalTableOfMeans[namesOfColumns] <- res$y
totalTableOflogMeans[namesOfColumns] <- res$ylog
}
totalTable$Target <- y$Target
totalTable$sd <- apply(totalTable[, -1], 1, sd)
totalTable$mean <- apply(totalTable[, -1], 1, mean)
totalTableOfMeans$sd <- apply(totalTableOfMeans[, -1], 1, sd)
totalTableOflogMeans$sd <- apply(totalTableOflogMeans[, -1], 1, sd)
rownames(totalTable) <- totalTable$Target
rownames(totalTableOfMeans) <- totalTableOfMeans$Target
rownames(totalTableOflogMeans) <- totalTableOflogMeans$Target
Prepare features
prepared_data <- data.frame(Target = character(172))
prepared_data$Target <- data$Amplicon_ID
prepared_data$Start_Dangling_End <- data$Start_Dangling_End
prepared_data$Stop_Dangling_End <- data$Stop_Dangling_End
prepared_data$Fwd_Primer_Len <- as.numeric(lapply(as.character(data$Ion_AmpliSeq_Fwd_Primer.), function(x) nchar(x)))
prepared_data$Rev_Primer_Len <- as.numeric(lapply(as.character(data$Ion_AmpliSeq_Rev_Primer.), function(x) nchar(x)))
prepared_data$Amplicon_Len <- as.numeric(lapply(as.character(data$Amplicon_Inner_Part), function(x) nchar(x)))
prepared_data$END_3 <- as.character(lapply(data$Amplicon_Inner_Part, function(x) substr(x, nchar(x), nchar(x))))
countCharOccurrences <- function(char, s) {s2 <- gsub(char,"",s);return (nchar(s) - nchar(s2))}
countGCcontent <- function(x){ gg <- countCharOccurrences('G',x); cc <-countCharOccurrences('C',x); aa <- countCharOccurrences('A',x); tt <- countCharOccurrences('T',x); (cc+gg)*100/(aa+tt+cc+gg) }
prepared_data$Fwd_Primer_GC_content <- as.numeric(lapply(as.character(data$Ion_AmpliSeq_Fwd_Primer.), function(x) countGCcontent(x)))
prepared_data$Rev_Primer_GC_content <- as.numeric(lapply(as.character(data$Ion_AmpliSeq_Rev_Primer.), function(x) countGCcontent(x)))
prepared_data$Amplicon_GC_content <- as.numeric(lapply(as.character(data$Amplicon_Inner_Part), function(x) countGCcontent(x)))
rownames(prepared_data) <- prepared_data$Target
valid_names <- intersect(y$Target, data$Amplicon_ID)
prepared_data <- prepared_data[valid_names,]
prepared_data$Start_Dangling_End <- as.factor(prepared_data$Start_Dangling_End)
prepared_data$Stop_Dangling_End <- as.factor(prepared_data$Stop_Dangling_End)
prepared_data$END_3 <- as.factor(prepared_data$END_3)
y <- y[valid_names,]
ylog <- ylog[valid_names,]
totalTable <- totalTable[valid_names,]
totalTableOfMeans <- totalTableOfMeans[valid_names,]
totalTableOflogMeans <- totalTableOflogMeans[valid_names,]
Graphics of dependencies of result on different features
library(lattice)
xyplot(ylog$Means ~ prepared_data$Amplicon_GC_content, xlab="GC-content(%)", ylab="log of amplicon percental amount", main="Amplicon GC-content")
xyplot(ylog$Means ~ prepared_data$Amplicon_Len, xlab="Length", ylab="log of amplicon percental amount", main="Amplicon length")
xyplot(ylog$Means ~ prepared_data$Fwd_Primer_GC_content, xlab="GC-content(%)", ylab="log of amplicon percental amount", main="Forward Primer GC-content")
xyplot(ylog$Means ~ prepared_data$Fwd_Primer_Len, xlab="Length", ylab="log of amplicon percental amount", main="Forward Primer length")
xyplot(ylog$Means ~ prepared_data$Rev_Primer_GC_content, xlab="GC-content(%)", ylab="log of amplicon percental amount", main="Reverse Primer GC-content")
xyplot(ylog$Means ~ prepared_data$Rev_Primer_Len, xlab="Length", ylab="log of amplicon percental amount", main="Reverse Primer length")
xyplot(y$Means ~ prepared_data$Amplicon_GC_content, xlab="GC-content(%)", ylab="amplicon percental amount", main="Amplicon GC-content")
xyplot(y$Means ~ prepared_data$Amplicon_Len, xlab="Length", ylab="amplicon percental amount", main="Amplicon length")
xyplot(y$Means ~ prepared_data$Fwd_Primer_GC_content, xlab="GC-content(%)", ylab="amplicon percental amount", main="Forward Primer GC-content")
xyplot(y$Means ~ prepared_data$Fwd_Primer_Len, xlab="Length", ylab="amplicon percental amount", main="Forward Primer length")
xyplot(y$Means ~ prepared_data$Rev_Primer_GC_content, xlab="GC-content(%)", ylab="amplicon percental amount", main="Reverse Primer GC-content")
xyplot(y$Means ~ prepared_data$Rev_Primer_Len, xlab="Length", ylab="amplicon percental amount", main="Reverse Primer length")
xyplot(totalTable$mean ~ prepared_data$Amplicon_GC_content, xlab="GC-content(%)", ylab="mean of amplicon percental amount", main="Amplicon GC-content")
xyplot(totalTable$mean ~ prepared_data$Amplicon_Len, xlab="Length", ylab="mean of amplicon percental amount", main="Amplicon length")
xyplot(totalTable$mean ~ prepared_data$Fwd_Primer_GC_content, xlab="GC-content(%)", ylab="mean of amplicon percental amount", main="Forward Primer GC-content")
xyplot(totalTable$mean ~ prepared_data$Fwd_Primer_Len, xlab="Length", ylab="mean of amplicon percental amount", main="Forward Primer length")
xyplot(totalTable$mean ~ prepared_data$Rev_Primer_GC_content, xlab="GC-content(%)", ylab="mean of amplicon percental amount", main="Reverse Primer GC-content")
xyplot(totalTable$mean ~ prepared_data$Rev_Primer_Len, xlab="Length", ylab="mean of amplicon percental amount", main="Reverse Primer length")
xyplot(totalTable$sd ~ prepared_data$Amplicon_GC_content, xlab="GC-content(%)", ylab="sd of amplicon percental amount", main="Amplicon GC-content")
xyplot(totalTable$sd ~ prepared_data$Amplicon_Len, xlab="Length", ylab="sd of amplicon percental amount", main="Amplicon length")
xyplot(totalTable$sd ~ prepared_data$Fwd_Primer_GC_content, xlab="GC-content(%)", ylab="sd of amplicon percental amount", main="Forward Primer GC-content")
xyplot(totalTable$sd ~ prepared_data$Fwd_Primer_Len, xlab="Length", ylab="sd of amplicon percental amount", main="Forward Primer length")
xyplot(totalTable$sd ~ prepared_data$Rev_Primer_GC_content, xlab="GC-content(%)", ylab="sd of amplicon percental amount", main="Reverse Primer GC-content")
xyplot(totalTable$sd ~ prepared_data$Rev_Primer_Len, xlab="Length", ylab="sd of amplicon percental amount", main="Reverse Primer length")
xyplot(log(totalTable$mean) ~ prepared_data$Amplicon_GC_content, xlab="GC-content(%)", ylab="mean of amplicon percental amount", main="Amplicon GC-content")
xyplot(log(totalTable$mean) ~ prepared_data$Amplicon_Len, xlab="Length", ylab="mean of amplicon percental amount", main="Amplicon length")
xyplot(log(totalTable$mean) ~ prepared_data$Fwd_Primer_GC_content, xlab="GC-content(%)", ylab="mean of amplicon percental amount", main="Forward Primer GC-content")
xyplot(log(totalTable$mean) ~ prepared_data$Fwd_Primer_Len, xlab="Length", ylab="mean of amplicon percental amount", main="Forward Primer length")
xyplot(log(totalTable$mean) ~ prepared_data$Rev_Primer_GC_content, xlab="GC-content(%)", ylab="mean of amplicon percental amount", main="Reverse Primer GC-content")
xyplot(log(totalTable$mean) ~ prepared_data$Rev_Primer_Len, xlab="Length", ylab="mean of amplicon percental amount", main="Reverse Primer length")
xyplot(totalTable$sd ~ prepared_data$Amplicon_GC_content, xlab="GC-content(%)", ylab="sd of amplicon percental amount", main="Amplicon GC-content")
xyplot(totalTable$sd ~ prepared_data$Amplicon_Len, xlab="Length", ylab="sd of amplicon percental amount", main="Amplicon length")
xyplot(totalTable$sd ~ prepared_data$Fwd_Primer_GC_content, xlab="GC-content(%)", ylab="sd of amplicon percental amount", main="Forward Primer GC-content")
xyplot(totalTable$sd ~ prepared_data$Fwd_Primer_Len, xlab="Length", ylab="sd of amplicon percental amount", main="Forward Primer length")
xyplot(totalTable$sd ~ prepared_data$Rev_Primer_GC_content, xlab="GC-content(%)", ylab="sd of amplicon percental amount", main="Reverse Primer GC-content")
xyplot(totalTable$sd ~ prepared_data$Rev_Primer_Len, xlab="Length", ylab="sd of amplicon percental amount", main="Reverse Primer length")
histogram(~ylog$Means|factor(prepared_data$Start_Dangling_End))
Graphics of dependencies of standard variation of result
bwplot(prepared_data$END_3 ~ ylog$Means, ylab="nucleotids", xlab="log of amplicon percental amount",
main="3' end")
bwplot(prepared_data$Start_Dangling_End ~ ylog$Means, ylab="nucleotids", xlab="log of amplicon percental amount",
main="First dangling end")
bwplot(prepared_data$Stop_Dangling_End ~ ylog$Means, ylab="nucleotids", xlab="log of amplicon percental amount",
main="Second dangling end")
bwplot(prepared_data$END_3 ~ y$Means, ylab="nucleotids", xlab="amplicon percental amount",
main="3' end")
bwplot(prepared_data$Start_Dangling_End ~ y$Means, ylab="nucleotids", xlab="amplicon percental amount",
main="First dangling end")
bwplot(prepared_data$Stop_Dangling_End ~ y$Means, ylab="nucleotids", xlab="amplicon percental amount",
main="Second dangling end")
bwplot(prepared_data$END_3 ~ totalTable$sd, ylab="nucleotids", xlab="standard deviation of amplicon percental amount",
main="3' end")
bwplot(prepared_data$Start_Dangling_End ~ totalTable$sd, ylab="nucleotids", xlab="standard deviation of amplicon percental amount",
main="First dangling end")
bwplot(prepared_data$Stop_Dangling_End ~ totalTable$sd, ylab="nucleotids", xlab="standard deviation of amplicon percental amount",
main="Second dangling end")
bwplot(prepared_data$END_3 ~ totalTable$mean, ylab="nucleotids", xlab="mean of amplicon percental amount",
main="3' end")
bwplot(prepared_data$Start_Dangling_End ~ totalTable$mean, ylab="nucleotids", xlab="mean of amplicon percental amount",
main="First dangling end")
bwplot(prepared_data$Stop_Dangling_End ~ totalTable$mean, ylab="nucleotids", xlab="mean of amplicon percental amount",
main="Second dangling end")
amplicon_len <- prepared_data$Amplicon_Len
q <- quantile(amplicon_len, probs = c(1/4, 1/2, 3/4, 1))
amplicon_len[amplicon_len <= q[1]] <- q[1]
amplicon_len[amplicon_len > q[1] & amplicon_len <= q[2]] <- q[2]
amplicon_len[amplicon_len > q[2] & amplicon_len <= q[3]] <- q[3]
amplicon_len[amplicon_len > q[3]] <- q[4]
histogram(amplicon_len)
bwplot(amplicon_len ~ y$Means, ylab="length", xlab="amplicon percental amount",
main="Amplicon length")
bwplot(amplicon_len ~ ylog$Means, ylab="length", xlab="log of amplicon percental amount",
main="Amplicon length")
bwplot(amplicon_len ~ totalTable$sd, ylab="length", xlab="standard deviation of amplicon percental amount",
main="Amplicon length")
bwplot(amplicon_len ~ totalTable$mean, ylab="length", xlab="mean of amplicon percental amount",
main="Amplicon length")
amplicon_gc <- prepared_data$Amplicon_GC_content
q <- quantile(amplicon_gc, probs = c(1/4, 1/2, 3/4, 1))
amplicon_gc[amplicon_gc <= q[1]] <- q[1]
amplicon_gc[amplicon_gc > q[1] & amplicon_gc <= q[2]] <- q[2]
amplicon_gc[amplicon_gc > q[2] & amplicon_gc <= q[3]] <- q[3]
amplicon_gc[amplicon_gc > q[3]] <- q[4]
histogram(amplicon_gc)
bwplot(amplicon_gc ~ y$Means, ylab="GC-content", xlab="amplicon percental amount",
main="Amplicon GC-content")
bwplot(amplicon_gc ~ ylog$Means, ylab="GC-content", xlab="log of amplicon percental amount",
main="Amplicon GC-content")
bwplot(amplicon_gc ~ totalTable$sd, ylab="GC-content", xlab="standard deviation of amplicon percental amount",
main="Amplicon GC-content")
bwplot(amplicon_gc ~ totalTable$mean, ylab="GC-content", xlab="mean of amplicon percental amount",
main="Amplicon GC-content")
primer_len <- prepared_data$Fwd_Primer_Len
q <- quantile(primer_len, probs = c(1/4, 1/2, 3/4, 1))
primer_len[primer_len <= q[1]] <- q[1]
primer_len[primer_len > q[1] & primer_len <= q[2]] <- q[2]
primer_len[primer_len > q[2] & primer_len <= q[3]] <- q[3]
primer_len[primer_len > q[3]] <- q[4]
histogram(primer_len)
bwplot(primer_len ~ y$Means, ylab="length", xlab="primer percental amount",
main="Forward Primer length")
bwplot(primer_len ~ ylog$Means, ylab="length", xlab="log of amplicon percental amount",
main="Forward Primer length")
bwplot(primer_len ~ totalTable$sd, ylab="length", xlab="standard deviation of amplicon percental amount",
main="Forward Primer length")
bwplot(primer_len ~ totalTable$mean, ylab="length", xlab="mean of amplicon percental amount",
main="Forward Primer length")
primer_gc <- prepared_data$Fwd_Primer_GC_content
q <- quantile(primer_gc, probs = c(1/4, 1/2, 3/4, 1))
primer_gc[primer_gc <= q[1]] <- q[1]
primer_gc[primer_gc > q[1] & primer_gc <= q[2]] <- q[2]
primer_gc[primer_gc > q[2] & primer_gc <= q[3]] <- q[3]
primer_gc[primer_gc > q[3]] <- q[4]
bwplot(primer_gc ~ y$Means, ylab="GC-content", xlab="primer percental amount",
main="Forward Primer GC-content")
bwplot(primer_gc ~ ylog$Means, ylab="GC-content", xlab="log of amplicon percental amount",
main="Forward Primer GC-content")
bwplot(primer_gc ~ totalTable$sd, ylab="GC-content", xlab="standard deviation of amplicon percental amount",
main="Forward Primer GC-content")
bwplot(primer_gc ~ totalTable$mean, ylab="GC-content", xlab="mean of amplicon percental amount",
main="Forward Primer GC-content")
primer_len <- prepared_data$Rev_Primer_Len
q <- quantile(primer_len, probs = c(1/4, 1/2, 3/4, 1))
primer_len[primer_len <= q[1]] <- q[1]
primer_len[primer_len > q[1] & primer_len <= q[2]] <- q[2]
primer_len[primer_len > q[2] & primer_len <= q[3]] <- q[3]
primer_len[primer_len > q[3]] <- q[4]
histogram(primer_len)
bwplot(primer_len ~ y$Means, ylab="length", xlab="primer percental amount",
main="Reverse Primer length")
bwplot(primer_len ~ ylog$Means, ylab="length", xlab="log of amplicon percental amount",
main="Reverse Primer length")
bwplot(primer_len ~ totalTable$sd, ylab="length", xlab="standard deviation of amplicon percental amount",
main="Reverse Primer length")
bwplot(primer_len ~ totalTable$mean, ylab="length", xlab="mean of amplicon percental amount",
main="Reverse Primer length")
primer_gc <- prepared_data$Rev_Primer_GC_content
q <- quantile(primer_gc, probs = c(1/4, 1/2, 3/4, 1))
primer_gc[primer_gc <= q[1]] <- q[1]
primer_gc[primer_gc > q[1] & primer_gc <= q[2]] <- q[2]
primer_gc[primer_gc > q[2] & primer_gc <= q[3]] <- q[3]
primer_gc[primer_gc > q[3]] <- q[4]
bwplot(primer_gc ~ y$Means, ylab="GC-content", xlab="primer percental amount",
main="Reverse Primer GC-content")
bwplot(primer_gc ~ ylog$Means, ylab="GC-content", xlab="log of amplicon percental amount",
main="Reverse Primer GC-content")
bwplot(primer_gc ~ totalTable$sd, ylab="GC-content", xlab="standard deviation of amplicon percental amount",
main="Reverse Primer GC-content")
bwplot(primer_gc ~ totalTable$mean, ylab="GC-content", xlab="mean of amplicon percental amount",
main="Reverse Primer GC-content")
prepared_data$res <- y$Means
prepared_data$logres <- ylog$Means
prepared_data$sd <- totalTable$sd
prepared_data$mean <- totalTable$mean
prepared_data$MeansSD <- totalTableOfMeans$sd
write.table(prepared_data, "../data/features.tsv", col.names = TRUE, sep = "\t", row.names = FALSE)
Use all features to train model
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
splitdf <- function(dataframe, n = 4, seed=NULL) {
if (!is.null(seed)) set.seed(seed)
index <- 1:nrow(dataframe)
trainindex <- sample(index, trunc(length(index)/n))
trainset <- dataframe[trainindex, ]
testset <- dataframe[-trainindex, ]
list(trainset=trainset,testset=testset)
}
splited <- splitdf(prepared_data)
trainSet <- splited$trainset
testSet <- splited$testset
formula <- res ~ Amplicon_Len + Amplicon_GC_content + END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
rf <- randomForest(formula, trainSet)
importance(rf)
## IncNodePurity
## Amplicon_Len 1.297057e-04
## Amplicon_GC_content 1.039724e-04
## END_3 4.852462e-05
## Start_Dangling_End 6.530856e-05
## Stop_Dangling_End 4.758021e-05
## Fwd_Primer_Len 3.321257e-05
## Fwd_Primer_GC_content 4.192514e-05
## Rev_Primer_Len 6.474327e-05
## Rev_Primer_GC_content 5.377356e-05
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
rmse <- function(error)
{
sqrt(mean(error^2))
}
sd(testSet$res)
## [1] 0.004352934
rmse(predicted - testSet$res)
## [1] 0.00601293
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.004024284
Use only amplicon information and 3’ end to train model
formula <- mean ~ Amplicon_Len + Amplicon_GC_content + END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
rf <- randomForest(formula, trainSet)
importance(rf)
## IncNodePurity
## Amplicon_Len 9.513483e-05
## Amplicon_GC_content 8.769304e-05
## END_3 4.572461e-05
## Start_Dangling_End 2.824424e-05
## Stop_Dangling_End 2.904317e-05
## Fwd_Primer_Len 2.524799e-05
## Fwd_Primer_GC_content 2.820977e-05
## Rev_Primer_Len 6.203297e-05
## Rev_Primer_GC_content 5.307879e-05
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
rmse(predicted - testSet$mean)
## [1] 0.005290301
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$mean)
## [1] 0.003831432
Use only amplicon information and 3’ end to train model
formula <- sd ~ Amplicon_Len + Amplicon_GC_content + END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
rf <- randomForest(formula, trainSet)
importance(rf)
## IncNodePurity
## Amplicon_Len 2.343244e-05
## Amplicon_GC_content 8.228964e-06
## END_3 1.205106e-05
## Start_Dangling_End 3.004172e-06
## Stop_Dangling_End 2.068637e-06
## Fwd_Primer_Len 7.137126e-06
## Fwd_Primer_GC_content 6.770621e-06
## Rev_Primer_Len 4.540529e-06
## Rev_Primer_GC_content 5.659024e-06
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
rmse(predicted - testSet$sd)
## [1] 0.002147819
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$sd)
## [1] 0.00193279
Use only amplicon information and 3’ end to train model
formula <- res ~ Amplicon_Len + Amplicon_GC_content + END_3
rf <- randomForest(formula, trainSet)
importance(rf)
## IncNodePurity
## Amplicon_Len 0.0002155846
## Amplicon_GC_content 0.0001887638
## END_3 0.0001117014
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.003903581
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.003750015
Use only amplicon information to train model
formula <- res ~ Amplicon_Len + Amplicon_GC_content
rf <- randomForest(formula, trainSet)
importance(rf)
## IncNodePurity
## Amplicon_Len 0.0002841399
## Amplicon_GC_content 0.0002689326
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
mean(testSet$res)
## [1] 0.007852893
rmse(predicted - testSet$res)
## [1] 0.003824522
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.003651368
Use other to train model
formula <- res ~ END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+Rev_Primer_Len +Rev_Primer_GC_content
rf <- randomForest(formula, trainSet)
importance(rf)
## IncNodePurity
## END_3 6.899303e-05
## Start_Dangling_End 8.206081e-05
## Stop_Dangling_End 7.034166e-05
## Fwd_Primer_Len 5.873686e-05
## Fwd_Primer_GC_content 7.620549e-05
## Rev_Primer_Len 1.059429e-04
## Rev_Primer_GC_content 9.323628e-05
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.006482053
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.004897928
Different feature selection tests
library(FSelector)
formula <- res ~ Amplicon_Len + Amplicon_GC_content + END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
chi.squared(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.4927980
## Amplicon_GC_content 0.4013567
## END_3 0.1636325
## Start_Dangling_End 0.2292457
## Stop_Dangling_End 0.1955937
## Fwd_Primer_Len 0.0000000
## Fwd_Primer_GC_content 0.0000000
## Rev_Primer_Len 0.0000000
## Rev_Primer_GC_content 0.0000000
chi <- chi.squared(formula, data = prepared_data)
chi$names <- rownames(chi)
chi <- chi[order(-chi$attr_importance), ]
barchart( chi$names ~ chi$attr_importance)
information.gain(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.20481515
## Amplicon_GC_content 0.20752242
## END_3 0.05912807
## Start_Dangling_End 0.11438548
## Stop_Dangling_End 0.09951913
## Fwd_Primer_Len 0.00000000
## Fwd_Primer_GC_content 0.00000000
## Rev_Primer_Len 0.00000000
## Rev_Primer_GC_content 0.00000000
gain.ratio(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.20575099
## Amplicon_GC_content 0.35674389
## END_3 0.03013050
## Start_Dangling_End 0.06257012
## Stop_Dangling_End 0.05606093
## Fwd_Primer_Len NaN
## Fwd_Primer_GC_content NaN
## Rev_Primer_Len NaN
## Rev_Primer_GC_content NaN
symmetrical.uncertainty(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.12348679
## Amplicon_GC_content 0.14294834
## END_3 0.02760319
## Start_Dangling_End 0.05512733
## Stop_Dangling_End 0.04858213
## Fwd_Primer_Len 0.00000000
## Fwd_Primer_GC_content 0.00000000
## Rev_Primer_Len 0.00000000
## Rev_Primer_GC_content 0.00000000
oneR(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 2.3739993
## Amplicon_GC_content 2.6550875
## END_3 0.7936508
## Start_Dangling_End 0.9641420
## Stop_Dangling_End 0.9178561
## Fwd_Primer_Len 2.9170625
## Fwd_Primer_GC_content 2.9170625
## Rev_Primer_Len 2.9170625
## Rev_Primer_GC_content 2.9170625
random.forest.importance(formula, data = prepared_data, importance.type = 1)
## attr_importance
## Amplicon_Len 31.7515991
## Amplicon_GC_content 29.6047708
## END_3 -2.6640769
## Start_Dangling_End 0.1273096
## Stop_Dangling_End -3.1139932
## Fwd_Primer_Len 6.6302716
## Fwd_Primer_GC_content 6.1315344
## Rev_Primer_Len 3.0663363
## Rev_Primer_GC_content 0.1135314
Different feature selection tests
library(FSelector)
formula <- MeansSD ~ Amplicon_Len + Amplicon_GC_content + END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
chi.squared(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.0000000
## Amplicon_GC_content 0.5266193
## END_3 0.1729355
## Start_Dangling_End 0.1274934
## Stop_Dangling_End 0.1298428
## Fwd_Primer_Len 0.0000000
## Fwd_Primer_GC_content 0.0000000
## Rev_Primer_Len 0.0000000
## Rev_Primer_GC_content 0.0000000
chi <- chi.squared(formula, data = prepared_data)
chi$names <- rownames(chi)
chi <- chi[order(-chi$attr_importance), ]
barchart( chi$names ~ chi$attr_importance)
information.gain(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.00000000
## Amplicon_GC_content 0.18033461
## END_3 0.06875444
## Start_Dangling_End 0.03639421
## Stop_Dangling_End 0.03419287
## Fwd_Primer_Len 0.00000000
## Fwd_Primer_GC_content 0.00000000
## Rev_Primer_Len 0.00000000
## Rev_Primer_GC_content 0.00000000
gain.ratio(formula, data = prepared_data)
## attr_importance
## Amplicon_Len NaN
## Amplicon_GC_content 0.39746114
## END_3 0.03503591
## Start_Dangling_End 0.01990803
## Stop_Dangling_End 0.01926146
## Fwd_Primer_Len NaN
## Fwd_Primer_GC_content NaN
## Rev_Primer_Len NaN
## Rev_Primer_GC_content NaN
symmetrical.uncertainty(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.00000000
## Amplicon_GC_content 0.12994916
## END_3 0.03209714
## Start_Dangling_End 0.01753995
## Stop_Dangling_End 0.01669189
## Fwd_Primer_Len 0.00000000
## Fwd_Primer_GC_content 0.00000000
## Rev_Primer_Len 0.00000000
## Rev_Primer_GC_content 0.00000000
oneR(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 2.9535005
## Amplicon_GC_content 2.7997829
## END_3 0.8917744
## Start_Dangling_End 0.7936508
## Stop_Dangling_End 0.9320720
## Fwd_Primer_Len 2.9535005
## Fwd_Primer_GC_content 2.9535005
## Rev_Primer_Len 2.9535005
## Rev_Primer_GC_content 2.9535005
random.forest.importance(formula, data = prepared_data, importance.type = 1)
## attr_importance
## Amplicon_Len 5.41421199
## Amplicon_GC_content 10.41942249
## END_3 -4.80328263
## Start_Dangling_End -1.41766512
## Stop_Dangling_End -2.78939925
## Fwd_Primer_Len 4.09201432
## Fwd_Primer_GC_content 6.75474582
## Rev_Primer_Len 3.57277811
## Rev_Primer_GC_content -0.08268964
Different feature selection tests
library(FSelector)
histogram(prepared_data$sd)
sd(prepared_data$sd)
## [1] 0.001848055
formula <- sd ~ Amplicon_Len + Amplicon_GC_content + END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
chi.squared(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.0000000
## Amplicon_GC_content 0.0000000
## END_3 0.1441501
## Start_Dangling_End 0.1902519
## Stop_Dangling_End 0.1559919
## Fwd_Primer_Len 0.0000000
## Fwd_Primer_GC_content 0.0000000
## Rev_Primer_Len 0.0000000
## Rev_Primer_GC_content 0.0000000
chi <- chi.squared(formula, data = prepared_data)
chi$names <- rownames(chi)
chi <- chi[order(-chi$attr_importance), ]
barchart( chi$names ~ chi$attr_importance)
information.gain(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.00000000
## Amplicon_GC_content 0.00000000
## END_3 0.04685588
## Start_Dangling_End 0.09190803
## Stop_Dangling_End 0.05322211
## Fwd_Primer_Len 0.00000000
## Fwd_Primer_GC_content 0.00000000
## Rev_Primer_Len 0.00000000
## Rev_Primer_GC_content 0.00000000
gain.ratio(formula, data = prepared_data)
## attr_importance
## Amplicon_Len NaN
## Amplicon_GC_content NaN
## END_3 0.02387683
## Start_Dangling_End 0.05027470
## Stop_Dangling_End 0.02998098
## Fwd_Primer_Len NaN
## Fwd_Primer_GC_content NaN
## Rev_Primer_Len NaN
## Rev_Primer_GC_content NaN
symmetrical.uncertainty(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.00000000
## Amplicon_GC_content 0.00000000
## END_3 0.02187407
## Start_Dangling_End 0.04429447
## Stop_Dangling_End 0.02598137
## Fwd_Primer_Len 0.00000000
## Fwd_Primer_GC_content 0.00000000
## Rev_Primer_Len 0.00000000
## Rev_Primer_GC_content 0.00000000
oneR(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 2.8965548
## Amplicon_GC_content 2.8965548
## END_3 0.7936508
## Start_Dangling_End 0.9049111
## Stop_Dangling_End 0.8319027
## Fwd_Primer_Len 2.8965548
## Fwd_Primer_GC_content 2.8965548
## Rev_Primer_Len 2.8965548
## Rev_Primer_GC_content 2.8965548
random.forest.importance(formula, data = prepared_data, importance.type = 1)
## attr_importance
## Amplicon_Len 4.9995589
## Amplicon_GC_content 2.8925120
## END_3 -3.6284379
## Start_Dangling_End 1.3762546
## Stop_Dangling_End -6.7916164
## Fwd_Primer_Len -2.6647553
## Fwd_Primer_GC_content -0.7990219
## Rev_Primer_Len 9.2348298
## Rev_Primer_GC_content 13.9168295
library(FSelector)
histogram(prepared_data$mean)
sd(prepared_data$mean)
## [1] 0.003951656
formula <- mean ~ Amplicon_Len + Amplicon_GC_content + END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
chi.squared(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.5359976
## Amplicon_GC_content 0.5601954
## END_3 0.2088283
## Start_Dangling_End 0.1638039
## Stop_Dangling_End 0.2313565
## Fwd_Primer_Len 0.0000000
## Fwd_Primer_GC_content 0.0000000
## Rev_Primer_Len 0.0000000
## Rev_Primer_GC_content 0.0000000
chi <- chi.squared(formula, data = prepared_data)
chi$names <- rownames(chi)
chi <- chi[order(-chi$attr_importance), ]
barchart( chi$names ~ chi$attr_importance)
information.gain(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.23081264
## Amplicon_GC_content 0.20283463
## END_3 0.09499156
## Start_Dangling_End 0.06088759
## Stop_Dangling_End 0.12678523
## Fwd_Primer_Len 0.00000000
## Fwd_Primer_GC_content 0.00000000
## Rev_Primer_Len 0.00000000
## Rev_Primer_GC_content 0.00000000
gain.ratio(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.23510367
## Amplicon_GC_content 0.42347307
## END_3 0.04840583
## Start_Dangling_End 0.03330618
## Stop_Dangling_End 0.07142041
## Fwd_Primer_Len NaN
## Fwd_Primer_GC_content NaN
## Rev_Primer_Len NaN
## Rev_Primer_GC_content NaN
symmetrical.uncertainty(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.13973841
## Amplicon_GC_content 0.14484429
## END_3 0.04434561
## Start_Dangling_End 0.02934438
## Stop_Dangling_End 0.06189259
## Fwd_Primer_Len 0.00000000
## Fwd_Primer_GC_content 0.00000000
## Rev_Primer_Len 0.00000000
## Rev_Primer_GC_content 0.00000000
oneR(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 2.4144563
## Amplicon_GC_content 2.7553665
## END_3 0.8749454
## Start_Dangling_End 0.7936508
## Stop_Dangling_End 1.0225918
## Fwd_Primer_Len 2.8854733
## Fwd_Primer_GC_content 2.8854733
## Rev_Primer_Len 2.8854733
## Rev_Primer_GC_content 2.8854733
random.forest.importance(formula, data = prepared_data, importance.type = 1)
## attr_importance
## Amplicon_Len 15.4388714
## Amplicon_GC_content 30.9788026
## END_3 -2.3855893
## Start_Dangling_End -1.9192058
## Stop_Dangling_End -4.3669253
## Fwd_Primer_Len 0.4986978
## Fwd_Primer_GC_content 1.4462451
## Rev_Primer_Len 3.6241943
## Rev_Primer_GC_content 0.5760765
Feature subset selection tests
library(corrplot)
prepared_data_num <- prepared_data
prepared_data_num$Start_Dangling_End <- as.numeric(prepared_data_num$Start_Dangling_End)
prepared_data_num$Stop_Dangling_End <- as.numeric(prepared_data_num$Stop_Dangling_End)
prepared_data_num$END_3 <- as.numeric(prepared_data_num$END_3)
prepared_data.scale<- scale(prepared_data_num[2:ncol(prepared_data_num)], center=TRUE, scale=TRUE)
cor_prepared_data <- cor(prepared_data.scale)
corrplot(cor_prepared_data, order = "hclust")
library(cvTools)
## Loading required package: robustbase
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:httr':
##
## progress
# control <- trainControl(method="repeatedcv", number=10, repeats=3)
# model <- train(res~., data=prepared_data, method="lvq", preProcess="scale", trControl=control)
# # estimate variable importance
# importance <- varImp(model, scale=FALSE)
# # summarize importance
# print(importance)
# # plot importance
# plot(importance)
evaluator <- function(subset) {
formula <- res ~ .
valid_names <- c(subset,"res")
rf <- randomForest(formula, prepared_data[,valid_names])
result <- cvFit(rf, data = prepared_data[,valid_names], y = prepared_data[,valid_names]$res, cost = rtmspe,
K = 5, R = 10, costArgs = list(trim = 0.1), seed = 1234)
return(1-result$cv)
}
#perform the best subset search
#subset <- exhaustive.search(names(prepared_data[-c(1, 11)]), evaluator)
#subset <- best.first.search(names(prepared_data[-c(1, 11)]), evaluator)